{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV\n",
    "from sklearn.preprocessing import scale\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from sklearn import model_selection\n",
    "from sklearn.model_selection import train_test_split, KFold, cross_val_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('I:\\Workdata\\\\702525\\Alex\\MA\\python\\manager_characteristics.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = data.query('minyear_manager < 2003')\n",
    "test = data.query('minyear_manager > 2002')\n",
    "#train = data.query('pnr < 88000')\n",
    "#test = data.query('pnr >= 88000')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = train.pe\n",
    "y_test = test.pe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = train.drop(['pe', 'minyear_manager', 'pnr'], axis=1).astype('float64')\n",
    "X_test = test.drop(['pe', 'minyear_manager', 'pnr'], axis=1).astype('float64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#LASSO\n",
    "\n",
    "lassocv = LassoCV(alphas=np.linspace(0.00005, 0.01, num=1000), cv=10)\n",
    "lassocv.fit(scale(X_train), y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lassocv.alpha_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "coef = pd.Series(lassocv.coef_, index=X_test.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = pd.DataFrame(lassocv.predict(scale(X_test)), index=test.pnr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_squared_error(y_test, y_predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = y_predict.merge(test[['pnr', 'pe']], left_index=True, right_on='pnr')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_excel('I:\\Workdata\\\\702525\\Alex\\MA\\python\\predict4.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "coef.to_excel('I:\\Workdata\\\\702525\\Alex\\MA\\python\\coef4.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#XGBoost\n",
    "\n",
    "model = XGBRegressor()\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict2 = pd.DataFrame(model.predict(X_test), index=test.pnr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean_squared_error(y_test, y_predict2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = y_predict2.merge(test[['pnr', 'pe']], left_index=True, right_on='pnr')\n",
    "df2.to_excel('I:\\Workdata\\\\702525\\Alex\\MA\\python\\predict_xgb.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "coef2 = pd.Series(model.feature_importances_, index=X_test.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "coef2.to_excel('I:\\Workdata\\\\702525\\Alex\\MA\\python\\coef_xgb.xlsx')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
